In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter  
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from textwrap import wrap
import matplotlib.patches as mpatches
from mpl_toolkits.axes_grid1 import make_axes_locatable
import re
%matplotlib inline

In [2]:
df_match = pd.read_csv('dataset.csv')
df_match.head()


Out[2]:
stage home_team_goal away_team_goal possession_home B365H B365D B365A BWH BWD BWA ... shoton_home shoton_away shotoff_home shotoff_away foulcommit_home foulcommit_away ycard_home ycard_away rcard_home rcard_away
0 2 0 2 57 1.83 3.50 4.33 1.80 3.60 4.00 ... 10 4 6 4 12 12 3 2 0 0
1 21 1 2 57 3.60 3.30 2.10 3.45 3.30 2.05 ... 6 9 6 8 10 18 2 1 0 0
2 21 1 3 60 1.80 3.60 4.50 1.72 3.60 4.50 ... 8 0 4 1 5 6 0 2 0 0
3 21 1 1 51 2.50 3.30 2.88 2.45 3.20 2.75 ... 4 3 5 6 12 10 2 0 0 0
4 21 4 0 53 2.10 3.25 3.75 2.00 3.25 3.65 ... 4 6 5 6 13 10 2 2 0 1

5 rows × 60 columns


In [3]:
for columnName in df_match:
    print ('Column:', columnName)
    column = df_match[columnName]
    if np.issubdtype(column.dtype, np.number):
        q25 = column.quantile(.25)
        q75 = column.quantile(.75)
        amplitude = 1.5 * (q75 - q25)
        low_limit = q25 - amplitude
        high_limit = q75 + amplitude
        print ('Median:', column.median())
        print ('25%/75%:', q25, q75)
        print ('Outliers limits:', low_limit, high_limit)
        print ('Outliers:', column[column < low_limit].count(), column[column > high_limit].count())
        print ('Count:', column.count())
        print ('Mean:', column.mean())
        print ('Variance:', column.var())
        print ('std:', column.std())
        print ('min:', column.min())
        print ('max:', column.max())
    print ('number of unique values:', len(column.unique()))
    if len(column.unique()) == 2:
        print ('value counts: ', column.value_counts().values)
    print ('mode:', column.mode().values)
    print ()


Column: stage
Median: 20.0
25%/75%: 10.0 30.0
Outliers limits: -20.0 60.0
Outliers: 0 0
Count: 2393
Mean: 20.119097367321356
Variance: 122.97786674344242
std: 11.08953861724835
min: 1
max: 38
number of unique values: 38
mode: [28 29 30 31 32 33 34 36 37 38]

Column: home_team_goal
Median: 1.0
25%/75%: 1.0 2.0
Outliers limits: -0.5 3.5
Outliers: 0 180
Count: 2393
Mean: 1.557041370664438
Variance: 1.7226019451941001
std: 1.3124793122918548
min: 0
max: 8
number of unique values: 9
mode: [1]

Column: away_team_goal
Median: 1.0
25%/75%: 0.0 2.0
Outliers limits: -3.0 5.0
Outliers: 0 8
Count: 2393
Mean: 1.182198077726703
Variance: 1.3196327918524957
std: 1.1487527113580607
min: 0
max: 6
number of unique values: 7
mode: [0]

Column: possession_home
Median: 52.0
25%/75%: 46.0 58.0
Outliers limits: 28.0 76.0
Outliers: 8 8
Count: 2393
Mean: 51.63727538654409
Variance: 81.7270715031439
std: 9.040302622320997
min: 20
max: 80
number of unique values: 55
mode: [52]

Column: B365H
Median: 2.2
25%/75%: 1.67 3.0
Outliers limits: -0.3250000000000002 4.995
Outliers: 0 253
Count: 2393
Mean: 2.681847053907221
Variance: 2.6014461689053845
std: 1.612899925260518
min: 1.1
max: 15.0
number of unique values: 91
mode: [ 2.1]

Column: B365D
Median: 3.6
25%/75%: 3.4 4.2
Outliers limits: 2.1999999999999993 5.4
Outliers: 0 245
Count: 2393
Mean: 3.9843042206435304
Variance: 1.032596432494729
std: 1.0161675218657251
min: 3.0
max: 11.0
number of unique values: 37
mode: [ 3.4]

Column: B365A
Median: 3.6
25%/75%: 2.45 5.5
Outliers limits: -2.124999999999999 10.075
Outliers: 0 228
Count: 2393
Mean: 4.8726243209360565
Variance: 14.474156989658999
std: 3.8044916861072253
min: 1.22
max: 29.0
number of unique values: 91
mode: [ 4.5]

Column: BWH
Median: 2.15
25%/75%: 1.65 2.9
Outliers limits: -0.2250000000000001 4.775
Outliers: 0 227
Count: 2393
Mean: 2.596205599665684
Variance: 2.1912273189151232
std: 1.4802794732465634
min: 1.1
max: 12.5
number of unique values: 127
mode: [ 2.1]

Column: BWD
Median: 3.5
25%/75%: 3.3 4.0
Outliers limits: 2.2499999999999996 5.050000000000001
Outliers: 0 229
Count: 2393
Mean: 3.8349644797325553
Variance: 0.7970934874501535
std: 0.8928009226306576
min: 2.9
max: 9.25
number of unique values: 50
mode: [ 3.4]

Column: BWA
Median: 3.4
25%/75%: 2.4 5.25
Outliers limits: -1.8750000000000004 9.525
Outliers: 0 219
Count: 2393
Mean: 4.50610530714585
Variance: 10.32367194877187
std: 3.2130471438763344
min: 1.22
max: 21.0
number of unique values: 134
mode: [ 3.1]

Column: IWH
Median: 2.1
25%/75%: 1.65 2.75
Outliers limits: -2.220446049250313e-16 4.4
Outliers: 0 234
Count: 2393
Mean: 2.500330129544508
Variance: 1.7404794478251069
std: 1.3192723175391452
min: 1.05
max: 10.0
number of unique values: 101
mode: [ 2.   2.2]

Column: IWD
Median: 3.4
25%/75%: 3.3 3.9
Outliers limits: 2.3999999999999995 4.8
Outliers: 0 228
Count: 2393
Mean: 3.7065608023401784
Variance: 0.5566479913893215
std: 0.7460884608337818
min: 3.0
max: 10.0
number of unique values: 45
mode: [ 3.3]

Column: IWA
Median: 3.3
25%/75%: 2.4 5.0
Outliers limits: -1.5000000000000004 8.9
Outliers: 0 220
Count: 2393
Mean: 4.215917258671138
Variance: 8.227673583452011
std: 2.8683921599830122
min: 1.27
max: 25.0
number of unique values: 109
mode: [ 3.3]

Column: LBH
Median: 2.15
25%/75%: 1.66 2.88
Outliers limits: -0.17000000000000015 4.71
Outliers: 0 245
Count: 2393
Mean: 2.600735478478877
Variance: 2.1605441076747067
std: 1.4698789432040678
min: 1.08
max: 12.0
number of unique values: 95
mode: [ 2.2]

Column: LBD
Median: 3.5
25%/75%: 3.3 4.0
Outliers limits: 2.2499999999999996 5.050000000000001
Outliers: 0 214
Count: 2393
Mean: 3.8464855829502613
Variance: 0.7947015651139706
std: 0.8914603553237634
min: 2.9
max: 10.0
number of unique values: 30
mode: [ 3.3]

Column: LBA
Median: 3.4
25%/75%: 2.4 5.5
Outliers limits: -2.2500000000000004 10.15
Outliers: 0 185
Count: 2393
Mean: 4.5796239030505514
Variance: 11.37567895548192
std: 3.3727850443634737
min: 1.22
max: 26.0
number of unique values: 94
mode: [ 4.]

Column: WHH
Median: 2.2
25%/75%: 1.67 2.9
Outliers limits: -0.17500000000000004 4.745
Outliers: 0 258
Count: 2393
Mean: 2.646126201420797
Variance: 2.295861701457856
std: 1.515210117923536
min: 1.1
max: 12.0
number of unique values: 82
mode: [ 2.5]

Column: WHD
Median: 3.3
25%/75%: 3.2 3.8
Outliers limits: 2.3000000000000007 4.699999999999999
Outliers: 0 266
Count: 2393
Mean: 3.654429586293388
Variance: 0.6587026949771349
std: 0.8116050116757134
min: 2.8
max: 9.5
number of unique values: 34
mode: [ 3.1]

Column: WHA
Median: 3.4
25%/75%: 2.5 5.5
Outliers limits: -2.0 10.0
Outliers: 0 203
Count: 2393
Mean: 4.669820309235278
Variance: 12.561206614854882
std: 3.5441792582846148
min: 1.22
max: 26.0
number of unique values: 80
mode: [ 4.]

Column: VCH
Median: 2.2
25%/75%: 1.67 3.0
Outliers limits: -0.3250000000000002 4.995
Outliers: 0 263
Count: 2393
Mean: 2.709289176765556
Variance: 2.70303886617461
std: 1.64409211000315
min: 1.09
max: 15.0
number of unique values: 110
mode: [ 2.15]

Column: VCD
Median: 3.6
25%/75%: 3.4 4.2
Outliers limits: 2.1999999999999993 5.4
Outliers: 0 237
Count: 2393
Mean: 4.005695779356446
Variance: 1.0565690077455585
std: 1.0278954264639757
min: 2.5
max: 10.0
number of unique values: 41
mode: [ 3.4]

Column: VCA
Median: 3.6
25%/75%: 2.5 5.75
Outliers limits: -2.375 10.625
Outliers: 0 232
Count: 2393
Mean: 4.987499791057238
Variance: 15.95730613890047
std: 3.994659702515406
min: 1.22
max: 29.0
number of unique values: 110
mode: [ 3.]

Column: buildUpPlaySpeed_home
Median: 58.0
25%/75%: 48.0 65.0
Outliers limits: 22.5 90.5
Outliers: 0 0
Count: 2393
Mean: 56.29878813205182
Variance: 122.18033715952501
std: 11.05352148229355
min: 25
max: 77
number of unique values: 37
mode: [60]

Column: buildUpPlayDribbling_home
Median: 32.0
25%/75%: 32.0 33.0
Outliers limits: 30.5 34.5
Outliers: 21 496
Count: 2393
Mean: 34.04220643543669
Variance: 24.56468944398864
std: 4.956277781156807
min: 24.0
max: 60.0
number of unique values: 21
mode: [ 32.]

Column: buildUpPlayPassing_home
Median: 52.0
25%/75%: 44.0 61.0
Outliers limits: 18.5 86.5
Outliers: 0 0
Count: 2393
Mean: 52.23234433765148
Variance: 165.86321762051097
std: 12.87878944701368
min: 24
max: 80
number of unique values: 44
mode: [70]

Column: buildUpPlayPositioningClass_home_isOrganised
number of unique values: 2
value counts:  [2284  109]
mode: [ True]

Column: chanceCreationPassing_home
Median: 49.0
25%/75%: 42.0 59.0
Outliers limits: 16.5 84.5
Outliers: 0 0
Count: 2393
Mean: 50.62264939406602
Variance: 123.69157709149034
std: 11.121671506185136
min: 28
max: 72
number of unique values: 37
mode: [70]

Column: chanceCreationCrossing_home
Median: 59.0
25%/75%: 50.0 69.0
Outliers limits: 21.5 97.5
Outliers: 0 0
Count: 2393
Mean: 57.31257835353113
Variance: 130.84205325734112
std: 11.438621125701346
min: 30
max: 76
number of unique values: 39
mode: [70]

Column: chanceCreationShooting_home
Median: 54.0
25%/75%: 46.0 60.0
Outliers limits: 25.0 81.0
Outliers: 17 0
Count: 2393
Mean: 52.34308399498537
Variance: 120.20791306024928
std: 10.963936932518779
min: 24
max: 80
number of unique values: 42
mode: [55]

Column: chanceCreationPositioningClass_home_isOrganised
number of unique values: 2
value counts:  [1960  433]
mode: [ True]

Column: defencePressure_home
Median: 44.0
25%/75%: 38.0 50.0
Outliers limits: 20.0 68.0
Outliers: 0 70
Count: 2393
Mean: 44.873798579189305
Variance: 92.94644112496454
std: 9.640873462760755
min: 25
max: 70
number of unique values: 34
mode: [40]

Column: defenceAggression_home
Median: 50.0
25%/75%: 42.0 58.0
Outliers limits: 18.0 82.0
Outliers: 0 0
Count: 2393
Mean: 50.507730881738404
Variance: 112.24251963992022
std: 10.594457024308523
min: 27
max: 70
number of unique values: 32
mode: [70]

Column: defenceTeamWidth_home
Median: 51.0
25%/75%: 45.0 56.0
Outliers limits: 28.5 72.5
Outliers: 0 0
Count: 2393
Mean: 50.772670288341
Variance: 72.26100862744737
std: 8.500647541655129
min: 30
max: 70
number of unique values: 27
mode: [50]

Column: buildUpPlaySpeed_away
Median: 58.0
25%/75%: 48.0 65.0
Outliers limits: 22.5 90.5
Outliers: 0 0
Count: 2393
Mean: 56.27455077308817
Variance: 122.44507531023484
std: 11.065490287837898
min: 25
max: 77
number of unique values: 37
mode: [60]

Column: buildUpPlayDribbling_away
Median: 32.0
25%/75%: 32.0 33.0
Outliers limits: 30.5 34.5
Outliers: 21 496
Count: 2393
Mean: 34.035938152946095
Variance: 24.454393527945918
std: 4.945138372982693
min: 24.0
max: 60.0
number of unique values: 21
mode: [ 32.]

Column: buildUpPlayPassing_away
Median: 52.0
25%/75%: 44.0 61.0
Outliers limits: 18.5 86.5
Outliers: 0 0
Count: 2393
Mean: 52.31383201002925
Variance: 165.53984622093122
std: 12.866228904419943
min: 24
max: 80
number of unique values: 44
mode: [70]

Column: buildUpPlayPositioningClass_away_isOrganised
number of unique values: 2
value counts:  [2283  110]
mode: [ True]

Column: chanceCreationPassing_away
Median: 49.0
25%/75%: 42.0 59.0
Outliers limits: 16.5 84.5
Outliers: 0 0
Count: 2393
Mean: 50.54492269118261
Variance: 123.46296856634588
std: 11.111389137562679
min: 28
max: 72
number of unique values: 37
mode: [70]

Column: chanceCreationCrossing_away
Median: 59.0
25%/75%: 50.0 69.0
Outliers limits: 21.5 97.5
Outliers: 0 0
Count: 2393
Mean: 57.31257835353113
Variance: 130.00426061520065
std: 11.401941089796976
min: 30
max: 76
number of unique values: 39
mode: [70]

Column: chanceCreationShooting_away
Median: 54.0
25%/75%: 45.0 60.0
Outliers limits: 22.5 82.5
Outliers: 0 0
Count: 2393
Mean: 52.309235269536146
Variance: 119.9076780520671
std: 10.950236438181008
min: 24
max: 80
number of unique values: 42
mode: [55]

Column: chanceCreationPositioningClass_away_isOrganised
number of unique values: 2
value counts:  [1959  434]
mode: [ True]

Column: defencePressure_away
Median: 44.0
25%/75%: 38.0 50.0
Outliers limits: 20.0 68.0
Outliers: 0 71
Count: 2393
Mean: 44.82490597576264
Variance: 93.36606839625526
std: 9.662611882728978
min: 25
max: 70
number of unique values: 34
mode: [40]

Column: defenceAggression_away
Median: 50.0
25%/75%: 42.0 58.0
Outliers limits: 18.0 82.0
Outliers: 0 0
Count: 2393
Mean: 50.541997492687
Variance: 112.43563095818878
std: 10.603566897897554
min: 27
max: 70
number of unique values: 32
mode: [70]

Column: defenceTeamWidth_away
Median: 51.0
25%/75%: 45.0 56.0
Outliers limits: 28.5 72.5
Outliers: 0 0
Count: 2393
Mean: 50.76264103635604
Variance: 72.4670485404062
std: 8.512757986716537
min: 30
max: 70
number of unique values: 27
mode: [50]

Column: corner_home
Median: 6.0
25%/75%: 4.0 8.0
Outliers limits: -2.0 14.0
Outliers: 0 28
Count: 2393
Mean: 6.119097367321354
Variance: 10.007130957488744
std: 3.163404962613662
min: 0
max: 19
number of unique values: 20
mode: [4]

Column: corner_away
Median: 4.0
25%/75%: 3.0 6.0
Outliers limits: -1.5 10.5
Outliers: 0 77
Count: 2393
Mean: 4.794818219807772
Variance: 7.336227318530784
std: 2.7085470862679837
min: 0
max: 19
number of unique values: 19
mode: [4]

Column: cross_home
Median: 16.0
25%/75%: 11.0 21.0
Outliers limits: -4.0 36.0
Outliers: 0 43
Count: 2393
Mean: 16.709151692436272
Variance: 58.000656178067544
std: 7.615816185942748
min: 0
max: 64
number of unique values: 50
mode: [12]

Column: cross_away
Median: 12.0
25%/75%: 8.0 16.0
Outliers limits: -4.0 28.0
Outliers: 0 45
Count: 2393
Mean: 12.91809444212286
Variance: 40.065195029538785
std: 6.329707341539479
min: 0
max: 50
number of unique values: 43
mode: [14]

Column: throwin_home
Median: 0.0
25%/75%: 0.0 0.0
Outliers limits: 0.0 0.0
Outliers: 0 533
Count: 2393
Mean: 0.5449226911826159
Variance: 2.41614582387032
std: 1.554395645860577
min: 0
max: 14
number of unique values: 15
mode: [0]

Column: throwin_away
Median: 0.0
25%/75%: 0.0 0.0
Outliers limits: 0.0 0.0
Outliers: 0 484
Count: 2393
Mean: 0.42498955286251566
Variance: 1.5412983381016483
std: 1.2414903697176423
min: 0
max: 25
number of unique values: 13
mode: [0]

Column: shoton_home
Median: 6.0
25%/75%: 4.0 9.0
Outliers limits: -3.5 16.5
Outliers: 0 33
Count: 2393
Mean: 6.659423318010865
Variance: 12.258122911445968
std: 3.5011602236181605
min: 0
max: 26
number of unique values: 26
mode: [7]

Column: shoton_away
Median: 5.0
25%/75%: 3.0 7.0
Outliers limits: -3.0 13.0
Outliers: 0 27
Count: 2393
Mean: 5.2741328875888005
Variance: 8.516792987350222
std: 2.9183544999451696
min: 0
max: 19
number of unique values: 19
mode: [4]

Column: shotoff_home
Median: 6.0
25%/75%: 4.0 8.0
Outliers limits: -2.0 14.0
Outliers: 0 30
Count: 2393
Mean: 6.55077308817384
Variance: 9.426455296733632
std: 3.0702532952076824
min: 0
max: 21
number of unique values: 21
mode: [6]

Column: shotoff_away
Median: 5.0
25%/75%: 3.0 7.0
Outliers limits: -3.0 13.0
Outliers: 0 10
Count: 2393
Mean: 5.14625992478061
Variance: 7.085622502644928
std: 2.6618832623999364
min: 0
max: 19
number of unique values: 18
mode: [4]

Column: foulcommit_home
Median: 10.0
25%/75%: 8.0 13.0
Outliers limits: 0.5 20.5
Outliers: 0 12
Count: 2393
Mean: 10.470956957793565
Variance: 11.955782403246435
std: 3.457713464595705
min: 1
max: 23
number of unique values: 23
mode: [10]

Column: foulcommit_away
Median: 11.0
25%/75%: 9.0 13.0
Outliers limits: 3.0 19.0
Outliers: 11 36
Count: 2393
Mean: 11.106560802340159
Variance: 12.964810616807263
std: 3.6006680792329724
min: 1
max: 25
number of unique values: 25
mode: [10]

Column: ycard_home
Median: 1.0
25%/75%: 1.0 2.0
Outliers limits: -0.5 3.5
Outliers: 0 135
Count: 2393
Mean: 1.445048056832428
Variance: 1.375009957973848
std: 1.1726081860424853
min: 0
max: 7
number of unique values: 8
mode: [1]

Column: ycard_away
Median: 2.0
25%/75%: 1.0 3.0
Outliers limits: -2.0 6.0
Outliers: 0 5
Count: 2393
Mean: 1.8119515252820728
Variance: 1.6811781715622893
std: 1.2966025495741897
min: 0
max: 9
number of unique values: 10
mode: [2]

Column: rcard_home
Median: 0.0
25%/75%: 0.0 0.0
Outliers limits: 0.0 0.0
Outliers: 0 151
Count: 2393
Mean: 0.0651901379022148
Variance: 0.06514646257828496
std: 0.25523805080411693
min: 0
max: 2
number of unique values: 3
mode: [0]

Column: rcard_away
Median: 0.0
25%/75%: 0.0 0.0
Outliers limits: 0.0 0.0
Outliers: 0 217
Count: 2393
Mean: 0.09569577935645633
Variance: 0.09660772012013838
std: 0.3108178246499682
min: 0
max: 2
number of unique values: 3
mode: [0]


In [4]:
rows = df_match['home_team_goal'].count()
print ('Home win:', np.sum(np.where(df_match['home_team_goal'] >  df_match['away_team_goal'], 1, 0))/rows)
print ('Away win:', np.sum(np.where(df_match['home_team_goal'] == df_match['away_team_goal'], 1, 0))/rows)
print ('Draw:', np.sum(np.where(df_match['home_team_goal'] <  df_match['away_team_goal'], 1, 0))/rows)


Home win: 0.450898453824
Away win: 0.257835353113
Draw: 0.291266193063

Does attributes normal distributed


In [5]:
##Confirm the normal distribution against each column values

##use scipy function for test
for columnName in df_match:
    print (columnName, " :Shapiro ", stats.shapiro(df_match[columnName].tolist()))
    print (columnName, " :Pearson ", stats.normaltest(df_match[columnName].tolist()), "\n")


stage  :Shapiro  (0.9491441249847412, 4.976150382759686e-28)
stage  :Pearson  NormaltestResult(statistic=2583.569665331433, pvalue=0.0) 

home_team_goal  :Shapiro  (0.8820047378540039, 2.5016456626604505e-39)
home_team_goal  :Pearson  NormaltestResult(statistic=386.51427308665245, pvalue=1.1735240943905259e-84) 

away_team_goal  :Shapiro  (0.8515219688415527, 9.234556879900544e-43)
away_team_goal  :Pearson  NormaltestResult(statistic=329.52383347285325, pvalue=2.7848944108583536e-72) 

possession_home  :Shapiro  (0.9982931017875671, 0.013109136372804642)
possession_home  :Pearson  NormaltestResult(statistic=1.2666284015150224, pvalue=0.53082960660255485) 

B365H  :Shapiro  (0.7721923589706421, 0.0)
B365H  :Pearson  NormaltestResult(statistic=1062.2772677656692, pvalue=2.1351194610956052e-231) 

B365D  :Shapiro  (0.7055726647377014, 0.0)
B365D  :Pearson  NormaltestResult(statistic=1204.0201761472408, pvalue=3.5509184588211322e-262) 

B365A  :Shapiro  (0.7656803727149963, 0.0)
B365A  :Pearson  NormaltestResult(statistic=992.7583320161865, pvalue=2.6623630114695299e-216) 

BWH  :Shapiro  (0.7883105278015137, 0.0)
BWH  :Pearson  NormaltestResult(statistic=970.60477013539207, pvalue=1.7212823305197741e-211) 

BWD  :Shapiro  (0.7190874814987183, 0.0)
BWD  :Pearson  NormaltestResult(statistic=1138.0353132858663, pvalue=7.5643499852541646e-248) 

BWA  :Shapiro  (0.7951842546463013, 0.0)
BWA  :Pearson  NormaltestResult(statistic=833.59831178517993, pvalue=9.6922929875025335e-182) 

IWH  :Shapiro  (0.7991344332695007, 0.0)
IWH  :Pearson  NormaltestResult(statistic=871.6187342755245, pvalue=5.3752255241044812e-190) 

IWD  :Shapiro  (0.6962065696716309, 0.0)
IWD  :Pearson  NormaltestResult(statistic=1223.4658598107096, pvalue=2.1269836111946757e-266) 

IWA  :Shapiro  (0.7896069884300232, 0.0)
IWA  :Pearson  NormaltestResult(statistic=873.77021763829509, pvalue=1.8331922567047914e-190) 

LBH  :Shapiro  (0.7887029647827148, 0.0)
LBH  :Pearson  NormaltestResult(statistic=948.50850134928987, pvalue=1.0814237739348853e-206) 

LBD  :Shapiro  (0.6944655179977417, 0.0)
LBD  :Pearson  NormaltestResult(statistic=1209.2623015424833, pvalue=2.5824244996002049e-263) 

LBA  :Shapiro  (0.7776610851287842, 0.0)
LBA  :Pearson  NormaltestResult(statistic=968.1307041045967, pvalue=5.9304646718680628e-211) 

WHH  :Shapiro  (0.7849525809288025, 0.0)
WHH  :Pearson  NormaltestResult(statistic=966.91239480074, pvalue=1.0905390642853889e-210) 

WHD  :Shapiro  (0.7172492742538452, 0.0)
WHD  :Pearson  NormaltestResult(statistic=1238.1180134936733, pvalue=1.3998754502571586e-269) 

WHA  :Shapiro  (0.7635395526885986, 0.0)
WHA  :Pearson  NormaltestResult(statistic=1011.6942408596269, pvalue=2.0577229942785253e-220) 

VCH  :Shapiro  (0.7691025733947754, 0.0)
VCH  :Pearson  NormaltestResult(statistic=1082.4851409042999, pvalue=8.736516098308038e-236) 

VCD  :Shapiro  (0.7186394929885864, 0.0)
VCD  :Pearson  NormaltestResult(statistic=1158.3234867083397, pvalue=2.9733821630209425e-252) 

VCA  :Shapiro  (0.7555606365203857, 0.0)
VCA  :Pearson  NormaltestResult(statistic=1039.6922768452625, pvalue=1.7127368600417171e-226) 

buildUpPlaySpeed_home  :Shapiro  (0.9676266312599182, 8.611160203004424e-23)
buildUpPlaySpeed_home  :Pearson  NormaltestResult(statistic=102.05115819615686, pvalue=6.9162799166805717e-23) 

buildUpPlayDribbling_home  :Shapiro  (0.5471799373626709, 0.0)
buildUpPlayDribbling_home  :Pearson  NormaltestResult(statistic=1314.7531888789711, pvalue=3.1986925775778879e-286) 

buildUpPlayPassing_home  :Shapiro  (0.9792335033416748, 3.34127448885791e-18)
buildUpPlayPassing_home  :Pearson  NormaltestResult(statistic=145.1222403368626, pvalue=3.0697705131193659e-32) 

buildUpPlayPositioningClass_home_isOrganised  :Shapiro  (0.21066886186599731, 0.0)
buildUpPlayPositioningClass_home_isOrganised  :Pearson  NormaltestResult(statistic=2078.2749066704437, pvalue=0.0) 

chanceCreationPassing_home  :Shapiro  (0.9673987030982971, 7.21257923780723e-23)
chanceCreationPassing_home  :Pearson  NormaltestResult(statistic=161.06749358282912, pvalue=1.0583715607067419e-35) 

chanceCreationCrossing_home  :Shapiro  (0.9470035433769226, 1.5490455561644483e-28)
chanceCreationCrossing_home  :Pearson  NormaltestResult(statistic=222.0070703220681, pvalue=6.1912338455259772e-49) 

chanceCreationShooting_home  :Shapiro  (0.987002432346344, 5.832719696769947e-14)
chanceCreationShooting_home  :Pearson  NormaltestResult(statistic=20.426532635011853, pvalue=3.6680461916719129e-05) 

chanceCreationPositioningClass_home_isOrganised  :Shapiro  (0.46759361028671265, 0.0)
chanceCreationPositioningClass_home_isOrganised  :Pearson  NormaltestResult(statistic=601.59064195885423, pvalue=2.3240845110151482e-131) 

defencePressure_home  :Shapiro  (0.9683494567871094, 1.5200173294227032e-22)
defencePressure_home  :Pearson  NormaltestResult(statistic=99.994898474588979, pvalue=1.9336759110825025e-22) 

defenceAggression_home  :Shapiro  (0.9583277106285095, 1.195638263593391e-25)
defenceAggression_home  :Pearson  NormaltestResult(statistic=300.55659492825259, pvalue=5.4320612908203278e-66) 

defenceTeamWidth_home  :Shapiro  (0.9878752827644348, 2.2017789338708155e-13)
defenceTeamWidth_home  :Pearson  NormaltestResult(statistic=8.6914777929939859, pvalue=0.012961927185905346) 

buildUpPlaySpeed_away  :Shapiro  (0.9678520560264587, 1.0269942204623338e-22)
buildUpPlaySpeed_away  :Pearson  NormaltestResult(statistic=99.837395204130758, pvalue=2.0921127568198183e-22) 

buildUpPlayDribbling_away  :Shapiro  (0.5469616651535034, 0.0)
buildUpPlayDribbling_away  :Pearson  NormaltestResult(statistic=1318.6832305990965, pvalue=4.4830624242800618e-287) 

buildUpPlayPassing_away  :Shapiro  (0.9796109199523926, 5.029788760685361e-18)
buildUpPlayPassing_away  :Pearson  NormaltestResult(statistic=139.37935876843181, pvalue=5.4219660991758487e-31) 

buildUpPlayPositioningClass_away_isOrganised  :Shapiro  (0.21193379163742065, 0.0)
buildUpPlayPositioningClass_away_isOrganised  :Pearson  NormaltestResult(statistic=2068.7312284896402, pvalue=0.0) 

chanceCreationPassing_away  :Shapiro  (0.9679208993911743, 1.0840055767358908e-22)
chanceCreationPassing_away  :Pearson  NormaltestResult(statistic=163.20086633137862, pvalue=3.642354022485884e-36) 

chanceCreationCrossing_away  :Shapiro  (0.9473085403442383, 1.8251610767251423e-28)
chanceCreationCrossing_away  :Pearson  NormaltestResult(statistic=226.52424555339633, pvalue=6.4697138025888606e-50) 

chanceCreationShooting_away  :Shapiro  (0.9870577454566956, 6.333509969735571e-14)
chanceCreationShooting_away  :Pearson  NormaltestResult(statistic=21.499228887713905, pvalue=2.1453678319678604e-05) 

chanceCreationPositioningClass_away_isOrganised  :Shapiro  (0.4681028127670288, 0.0)
chanceCreationPositioningClass_away_isOrganised  :Pearson  NormaltestResult(statistic=599.28241501575724, pvalue=7.3701619069711684e-131) 

defencePressure_away  :Shapiro  (0.968295693397522, 1.4566109617804453e-22)
defencePressure_away  :Pearson  NormaltestResult(statistic=99.580561861132324, pvalue=2.3787880024395615e-22) 

defenceAggression_away  :Shapiro  (0.9584013819694519, 1.2539077208765739e-25)
defenceAggression_away  :Pearson  NormaltestResult(statistic=303.56344297667169, pvalue=1.2079136911917642e-66) 

defenceTeamWidth_away  :Shapiro  (0.9880092144012451, 2.7143475726625066e-13)
defenceTeamWidth_away  :Pearson  NormaltestResult(statistic=7.4848786716298221, pvalue=0.02369622951207918) 

corner_home  :Shapiro  (0.9672448635101318, 6.402908181906488e-23)
corner_home  :Pearson  NormaltestResult(statistic=144.57031518174702, pvalue=4.0453391244661038e-32) 

corner_away  :Shapiro  (0.9532545804977417, 5.224283730120638e-27)
corner_away  :Pearson  NormaltestResult(statistic=267.41705106791517, pvalue=8.5334603945348468e-59) 

cross_home  :Shapiro  (0.965386688709259, 1.5683941613265093e-23)
cross_home  :Pearson  NormaltestResult(statistic=303.18516548964891, pvalue=1.4594116758788932e-66) 

cross_away  :Shapiro  (0.9658812880516052, 2.2679489123283556e-23)
cross_away  :Pearson  NormaltestResult(statistic=261.58229627134216, pvalue=1.5780693428924472e-57) 

throwin_home  :Shapiro  (0.39515167474746704, 0.0)
throwin_home  :Pearson  NormaltestResult(statistic=2288.7587510467574, pvalue=0.0) 

throwin_away  :Shapiro  (0.37900757789611816, 0.0)
throwin_away  :Pearson  NormaltestResult(statistic=3015.1913817865766, pvalue=0.0) 

shoton_home  :Shapiro  (0.9478259086608887, 2.4147757421439233e-28)
shoton_home  :Pearson  NormaltestResult(statistic=352.03851478227165, pvalue=3.5959016608001903e-77) 

shoton_away  :Shapiro  (0.9563610553741455, 3.4382308420319405e-26)
shoton_away  :Pearson  NormaltestResult(statistic=222.1577511367617, pvalue=5.7419219473914491e-49) 

shotoff_home  :Shapiro  (0.9696898460388184, 4.474362508863893e-22)
shotoff_home  :Pearson  NormaltestResult(statistic=130.86725203813305, pvalue=3.8241708699714146e-29) 

shotoff_away  :Shapiro  (0.963733971118927, 4.702830968507155e-24)
shotoff_away  :Pearson  NormaltestResult(statistic=165.31288423789977, pvalue=1.2669611385343927e-36) 

foulcommit_home  :Shapiro  (0.9873665571212769, 1.0076345953371341e-13)
foulcommit_home  :Pearson  NormaltestResult(statistic=28.879863533203288, pvalue=5.355713228840061e-07) 

foulcommit_away  :Shapiro  (0.9881348609924316, 3.308059121770457e-13)
foulcommit_away  :Pearson  NormaltestResult(statistic=35.423758642669, pvalue=2.0315548231485984e-08) 

ycard_home  :Shapiro  (0.8909401893615723, 3.4700981653787463e-38)
ycard_home  :Pearson  NormaltestResult(statistic=175.2761903092773, pvalue=8.6947854476346784e-39) 

ycard_away  :Shapiro  (0.9171902537345886, 2.4883553184427367e-34)
ycard_away  :Pearson  NormaltestResult(statistic=147.59988804664445, pvalue=8.8938857430093246e-33) 

rcard_home  :Shapiro  (0.2615692615509033, 0.0)
rcard_home  :Pearson  NormaltestResult(statistic=1907.1407339477614, pvalue=0.0) 

rcard_away  :Shapiro  (0.3273508548736572, 0.0)
rcard_away  :Pearson  NormaltestResult(statistic=1603.8397120222235, pvalue=0.0) 


In [6]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(6,4))
##Most not norm distributed, possesion seems to be. plot it in histogram.
phhist, phbins = np.histogram(df_match['possession_home'].tolist(), bins=25)  # plt.hist passes it's arguments to np.histogram
phwidth = 0.7 * (phbins[1] - phbins[0])
phcenter = (phbins[:-1] + phbins[1:]) / 2 # bin edges for the x-axis, a function of bins 


plt.figure(1)
plt.subplot(221)
plt.bar(phcenter, phhist, align='center', width=phwidth)
plt.xlabel('Possesion rate')
plt.ylabel('No. of games')
plt.title('Possesion')



dtwhist, dtwbins = np.histogram(df_match['defenceTeamWidth_away'].tolist(), bins=25)  # plt.hist passes it's arguments to np.histogram
dtwwidth = 0.7 * (dtwbins[1] - dtwbins[0])
dtwcenter = (dtwbins[:-1] + dtwbins[1:]) / 2 # bin edges for the x-axis, a function of bins 
plt.subplot(222)
plt.bar(dtwcenter, dtwhist, align='center', width=dtwwidth)


plt.xlabel('defenceTeamWidth')
plt.ylabel('Rate')
plt.title('defenceTeamWidth_away')


bhhist, bhbins = np.histogram(df_match['B365H'].tolist(), bins=25)  # plt.hist passes it's arguments to np.histogram
bhwidth = 0.7 * (bhbins[1] - bhbins[0])
bhcenter = (bhbins[:-1] + bhbins[1:]) / 2 # bin edges for the x-axis, a function of bins 
plt.subplot(223)
plt.bar(bhcenter, bhhist, align='center', width=bhwidth)
plt.xlabel('B365 rate')
plt.ylabel('No. of game')
plt.title('B365 game rating')


yhhist, yhbins = np.histogram(df_match['ycard_home'].tolist(), bins=25)  # plt.hist passes it's arguments to np.histogram
yhwidth = 0.7 * (yhbins[1] - yhbins[0])
yhcenter = (yhbins[:-1] + yhbins[1:]) / 2 # bin edges for the x-axis, a function of bins 
plt.subplot(224)
plt.bar(yhcenter, yhhist, align='center', width=yhwidth)
plt.xlabel('Yellow card')
plt.ylabel('No. of game')
plt.title('Yellow card home')

plt.tight_layout()
plt.savefig('normal.pdf')
plt.show()


Does data correlated


In [7]:
fig = plt.figure(num=None, figsize=(6, 6), dpi=180, facecolor='w', edgecolor='k')
ax = fig.add_subplot(1,1,1) # two rows, one column, first plot
corr = df_match.corr()
divider = make_axes_locatable(ax)
cax = divider.append_axes('right', size='5%', pad=0.05)
labels = [ l[:15] + "..."+l[-4:] if len(l) > 20 else l for l in df_match.columns  ]
sns.set(font_scale=0.5)
h = sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax, cbar_ax=cax, xticklabels=labels, yticklabels=labels)
plt.show()
fig.tight_layout()
fig.savefig('corr.pdf')



In [8]:
sns.pairplot(df_match[['home_team_goal', 'shoton_home','corner_home','possession_home','B365A','BWA']],
            size=1.5)
plt.tight_layout()
plt.savefig('matrix.pdf')



In [9]:
def do_pca(df):
    print (df.columns.values)
    
    standardized = (df - df.mean()) / df.std()
    pca = PCA()
    pca.fit(standardized)
    pcavar = np.arange(len(pca.explained_variance_ratio_))
    apply_pca = lambda pcanr: np.einsum('i,ji->j', pca.components_[pcanr], standardized)
    win  = df_match['home_team_goal'] >  df_match['away_team_goal']
    draw = df_match['home_team_goal'] == df_match['away_team_goal']
    lose = df_match['home_team_goal'] <  df_match['away_team_goal']
    
    print (np.cumsum(pca.explained_variance_ratio_))
    print (pca.explained_variance_ratio_)
    fig, ax = plt.subplots(1, 2, figsize=(6, 3))
    ax = np.reshape(ax, -1)    
    ax[0].plot(pcavar, np.cumsum(pca.explained_variance_ratio_))
    ax[0].set_xlabel('PCA variable')
    ax[0].set_ylabel('Cumulative explained variance ratio')
    ax[0].set_title('a)')
    ax[0].set_ylim([0, 1])
    ax[1].bar(pcavar, pca.explained_variance_ratio_)
    ax[1].set_xlabel('PCA variable')
    ax[1].set_ylabel('Explained variance ratio')
    ax[1].set_xlim([np.min(pcavar)-0.5, np.max(pcavar)+0.5])
    ax[1].set_title('b)')
    fig.tight_layout()
    fig.savefig('pca_variables.pdf')
    
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))
    ax = np.reshape(ax, -1)    
    colors = np.full(len(df_match['away_team_goal']), 'r')
    colors[draw] = 'g';
    colors[lose] = 'b';
    ax[0].scatter(apply_pca(0), apply_pca(1), c=colors, s=0.1, marker='*')
    ax[0].set_xlabel('PCA0')
    ax[0].set_ylabel('PCA1')
    red = mpatches.Patch(color='red', label='Home team win')
    blue = mpatches.Patch(color='blue', label='Away team win')
    green = mpatches.Patch(color='green', label='Draw')
    ax[0].legend(handles=[red, blue, green], labels=['Home team win', 'Away team win', 'Draw'])
    fig.tight_layout()
    fig.savefig('pca_scatter.pdf')
    
    labels = [ l[:15] + "..."+l[-4:] if len(l) > 20 else l for l in standardized.columns  ]
    fig, ax = plt.subplots(1, 2, figsize=(6, 6))
    ax = np.reshape(ax, -1)    
    for i in range(2):
        x = np.arange(len(pca.components_[i]))
        ax[i].barh(x, np.abs(pca.components_[i]), tick_label=labels)
        ax[i].set_xlabel('Absolute values of components of PCA%d   ' % i, fontsize=8)
        ax[i].set_ylim([np.min(x)-0.5, np.max(x)+0.5])
        ax[i].tick_params(axis='both', which='major', labelsize=6)
    fig.tight_layout()
    fig.savefig('pca_components.pdf')
    
    fig, ax = plt.subplots(1, 2, figsize=(6, 6))
    ax = np.reshape(ax, -1)    
    for i in range(2):
        x = np.arange(len(pca.components_[i]))
        ax[i].barh(x, pca.components_[i], tick_label=labels)
        ax[i].set_xlabel('Values of components of PCA%d   ' % i, fontsize=8)
        ax[i].set_ylim([np.min(x)-0.5, np.max(x)+0.5])
        ax[i].tick_params(axis='both', which='major', labelsize=6)
    fig.tight_layout()
    fig.savefig('pca_components_withsign.pdf')
    
#do_pca(df_match)
team_attributes = ['buildUpPlaySpeed_home', 'buildUpPlayDribbling_home', 'buildUpPlayPassing_home',
                      'buildUpPlayPositioningClass_home_isOrganised', 'chanceCreationPassing_home',
                      'chanceCreationCrossing_home', 'chanceCreationShooting_home',
                      'chanceCreationPositioningClass_home_isOrganised', 'defencePressure_home',
                      'defenceAggression_home', 'defenceTeamWidth_home', 'buildUpPlaySpeed_away',
                      'buildUpPlayDribbling_away', 'buildUpPlayPassing_away' ,
                      'buildUpPlayPositioningClass_away_isOrganised', 'chanceCreationPassing_away',
                      'chanceCreationCrossing_away', 'chanceCreationShooting_away',
                      'chanceCreationPositioningClass_away_isOrganised', 'defencePressure_away' ,
                      'defenceAggression_away', 'defenceTeamWidth_away']
#do_pca(df_match.filter(team_attributes + ['stage']))
after_match_attributes = ['home_team_goal', 'away_team_goal', 'possession_home', 'shoton_home', 'shoton_away', 
                      'shotoff_home', 'shotoff_away', 'corner_home', 'corner_away', 'cross_home', 'cross_away', 
                      'foulcommit_home', 'foulcommit_away', 'rcard_home', 'rcard_away', 'ycard_home', 'ycard_away', 
                      'throwin_home', 'throwin_away']
do_pca(df_match.drop(after_match_attributes, axis=1))


['stage' 'B365H' 'B365D' 'B365A' 'BWH' 'BWD' 'BWA' 'IWH' 'IWD' 'IWA' 'LBH'
 'LBD' 'LBA' 'WHH' 'WHD' 'WHA' 'VCH' 'VCD' 'VCA' 'buildUpPlaySpeed_home'
 'buildUpPlayDribbling_home' 'buildUpPlayPassing_home'
 'buildUpPlayPositioningClass_home_isOrganised'
 'chanceCreationPassing_home' 'chanceCreationCrossing_home'
 'chanceCreationShooting_home'
 'chanceCreationPositioningClass_home_isOrganised' 'defencePressure_home'
 'defenceAggression_home' 'defenceTeamWidth_home' 'buildUpPlaySpeed_away'
 'buildUpPlayDribbling_away' 'buildUpPlayPassing_away'
 'buildUpPlayPositioningClass_away_isOrganised'
 'chanceCreationPassing_away' 'chanceCreationCrossing_away'
 'chanceCreationShooting_away'
 'chanceCreationPositioningClass_away_isOrganised' 'defencePressure_away'
 'defenceAggression_away' 'defenceTeamWidth_away']
[ 0.32858837  0.45702043  0.5404545   0.5901506   0.63428029  0.67313183
  0.70760243  0.74045824  0.76714979  0.79277571  0.81680183  0.83925132
  0.85811032  0.87581498  0.89333104  0.90898082  0.92127932  0.93254036
  0.9436326   0.95395209  0.96282704  0.9710805   0.97924366  0.98558946
  0.99090895  0.9926473   0.99420941  0.99533252  0.996228    0.99675506
  0.99723918  0.99770621  0.99809528  0.99846606  0.99876298  0.99903405
  0.99929648  0.99951902  0.99972202  0.99987885  1.        ]
[  3.28588373e-01   1.28432053e-01   8.34340761e-02   4.96961013e-02
   4.41296820e-02   3.88515406e-02   3.44705995e-02   3.28558188e-02
   2.66915429e-02   2.56259230e-02   2.40261190e-02   2.24494909e-02
   1.88589997e-02   1.77046581e-02   1.75160654e-02   1.56497795e-02
   1.22984984e-02   1.12610367e-02   1.10922453e-02   1.03194887e-02
   8.87494757e-03   8.25345561e-03   8.16316361e-03   6.34579911e-03
   5.31948892e-03   1.73835425e-03   1.56210991e-03   1.12311329e-03
   8.95474618e-04   5.27061267e-04   4.84123774e-04   4.67025529e-04
   3.89074842e-04   3.70776788e-04   2.96917564e-04   2.71069957e-04
   2.62427751e-04   2.22542992e-04   2.02997071e-04   1.56836053e-04
   1.21147214e-04]

In [15]:
def do_boxplot(df, name):
    print(df.columns)
    labels = [ l[:15] + "..."+l[-4:] if len(l) > 20 else l for l in df.columns  ]
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    sns.boxplot(data=df, orient="h", palette="Set2", ax=ax)
    ax.set_yticklabels(labels)
    fig.tight_layout()
    fig.savefig('box_%s.pdf' % name)
    
#do_boxplot(df_match.filter(regex='.*[HAD]$'), 'odds')
do_boxplot(df_match.filter(team_attributes).select(lambda x: not re.search('_away|_isOrganised$', x), axis=1), 'team')
do_boxplot(df_match.filter(['B365H', 'B365A', 'B365D'] + after_match_attributes).drop(['possession_home', 'cross_home', 'cross_away'], axis=1), 'misc')


Index(['buildUpPlaySpeed_home', 'buildUpPlayDribbling_home',
       'buildUpPlayPassing_home', 'chanceCreationPassing_home',
       'chanceCreationCrossing_home', 'chanceCreationShooting_home',
       'defencePressure_home', 'defenceAggression_home',
       'defenceTeamWidth_home'],
      dtype='object')
Index(['B365H', 'B365A', 'B365D', 'home_team_goal', 'away_team_goal',
       'shoton_home', 'shoton_away', 'shotoff_home', 'shotoff_away',
       'corner_home', 'corner_away', 'foulcommit_home', 'foulcommit_away',
       'rcard_home', 'rcard_away', 'ycard_home', 'ycard_away', 'throwin_home',
       'throwin_away'],
      dtype='object')

In [11]:
df_match.shape


Out[11]:
(2393, 60)

In [ ]: